library(lubridate)
## 
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
## 
##     date
library(xgboost)
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
library(tidyverse)
## ── Attaching packages ───────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.2.0     ✔ purrr   0.3.2
## ✔ tibble  2.1.3     ✔ dplyr   0.8.3
## ✔ tidyr   0.8.3     ✔ stringr 1.4.0
## ✔ readr   1.3.1     ✔ forcats 0.4.0
## ── Conflicts ──────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ lubridate::as.difftime() masks base::as.difftime()
## ✖ dplyr::combine()         masks randomForest::combine()
## ✖ lubridate::date()        masks base::date()
## ✖ dplyr::filter()          masks stats::filter()
## ✖ lubridate::intersect()   masks base::intersect()
## ✖ dplyr::lag()             masks stats::lag()
## ✖ ggplot2::margin()        masks randomForest::margin()
## ✖ lubridate::setdiff()     masks base::setdiff()
## ✖ dplyr::slice()           masks xgboost::slice()
## ✖ lubridate::union()       masks base::union()
library(magrittr)
## 
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
## 
##     set_names
## The following object is masked from 'package:tidyr':
## 
##     extract
library(segmented)
source('functions.r')

## Load the pre-assembled tables (features, features_on, outcomes, data_before).
load("Table_construction.Rdata")
set.seed(12)
features = features %>%
  # Add other useful information (race, sex, name) from the nested person table:
  inner_join(
    data_before %>% 
      select(person_id, screening_date, people) %>%
      unnest() %>%
      select(person_id, screening_date, race, sex, name),
    by = c("person_id","screening_date")
  ) %>%
  inner_join(features_on, by = c("person_id","screening_date")) %>%
  inner_join(outcomes, by = c("person_id","screening_date")) %>%
    # select(-c(offenses_within_30.y)) %>%

  # Create as many features as possible:
  mutate(
    raw_score = `Risk of Recidivism_raw_score`, # Adjust for violent/general
    decile_score = `Risk of Recidivism_decile_score`, # Adjust for violent/general
    # Cap count items at 5 before summing the subscale.
    p_jail30 = pmin(p_jail30,5),
    # FIX: was pmin(p_jail30,5) — capped the wrong column (copy/paste bug);
    # every other line here caps its own column.
    p_prison30 = pmin(p_prison30,5),
    p_prison = pmin(p_prison,5),
    p_probation = pmin(p_probation,5),
    # Race indicators; race == "Other" is the baseline.
    race_black = if_else(race=="African-American",1,0),
    race_white = if_else(race=="Caucasian",1,0),
    race_hispanic = if_else(race=="Hispanic",1,0),
    race_asian = if_else(race=="Asian",1,0),
    race_native = if_else(race=="Native American",1,0), # race == "Other" is the baseline
    
    # Subscales:
    crim_inv = p_arrest+ 
              p_jail30+
              p_prison+
              p_probation,
  
    # Filters (TRUE for observations to keep)
    # FIX: the two conditions were separated by a comma, so the second became
    # a stray unnamed mutate column; combine them with `&` so filt1 drops rows
    # where either COMPAS decile score carries the missing code -1.
    filt1 = `Risk of Recidivism_decile_score` != -1 & `Risk of Violence_decile_score` != -1, 
    filt2 = offenses_within_30 == 1,
    filt3 = !is.na(current_offense_date), 
    filt4 = current_offense_date <= current_offense_date_limit, 
    filt5 = p_current_age > 18 & p_current_age <= 65, 
    filt6 = crim_inv == 0
  )

Modelling the COMPAS Risk of Recidivism score with a quadratic polynomial in age.

# Filter out any individuals with crim inv history (filt6: crim_inv == 0),
# plus the data-quality and age-range filters.
features_age_poly = features %>%
  filter(filt1,filt5, filt6) 

# Lower envelope: the smallest raw score observed at each age value
# (top_n with a negative n keeps the n smallest, ties included).
lb_age = features_age_poly %>%
  group_by(p_current_age) %>%
  arrange(raw_score) %>%
  top_n(n=-1, wt=raw_score) # Fit lower bound on smallest value for each age

# Quadratic in age fit to the lower-bound points only.
mdl_age = lm(raw_score ~ 
               I(p_current_age^2) + 
               p_current_age, 
             data=lb_age)

# More precision for paper
summary(mdl_age)
## 
## Call:
## lm(formula = raw_score ~ I(p_current_age^2) + p_current_age, 
##     data = lb_age)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.213689 -0.016285  0.006031  0.027235  0.080507 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        -9.816e-02  5.402e-02  -1.817   0.0721 .  
## I(p_current_age^2)  4.291e-04  3.229e-05  13.287   <2e-16 ***
## p_current_age      -7.178e-02  2.728e-03 -26.313   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.04547 on 103 degrees of freedom
## Multiple R-squared:  0.9894, Adjusted R-squared:  0.9892 
## F-statistic:  4789 on 2 and 103 DF,  p-value: < 2.2e-16
print("Coefficients:")
## [1] "Coefficients:"
sprintf("%.20e",mdl_age$coefficients) # More precision for paper
## [1] "-9.81565971191264569073e-02" "4.29064498817958452602e-04" 
## [3] "-7.17796337351976898589e-02"
## Add f(age) to features
# filt7 keeps points at or above the fitted quadratic minus a 0.05 tolerance;
# raw_score__age_poly is the age-polynomial remainder used downstream.
features = features %>%
  mutate(
    f_age = predict(mdl_age, newdata=data.frame(p_current_age=p_current_age)),
    raw_score__age_poly = raw_score - f_age,
    filt7 = raw_score >= f_age - 0.05
  )
## Add same filters to lb_age 
lb_age = lb_age %>% 
    mutate(
    f_age = predict(mdl_age, newdata=data.frame(p_current_age=p_current_age)),
    raw_score__age_poly = raw_score - f_age,
    filt7 = raw_score >= f_age - 0.05
    )

Plotting settings

# Plot range for the age axis, and a dense grid for drawing fitted curves.
xmin <- 18
xmax <- 65
xx <- seq(from = xmin, to = xmax, length.out = 1000)

Age polynomial plot

Generate a preliminary plot of age vs COMPAS general score

# Lower-bound points per age, colored by whether they sit above the fitted
# quadratic (filt7), with the fitted curve overlaid.
ggplot() +
  geom_point(
    aes(x = p_current_age, y = raw_score, color = factor(filt7)),
    alpha = 0.3, data = lb_age
  ) +
  geom_line(
    aes(x = xx, y = predict(mdl_age, newdata = data.frame(p_current_age = xx))),
    color = "#F8766D"
  ) +
  theme_bw() +
  xlim(xmin, xmax) +
  xlab("Age at COMPAS screening date") +
  ylab("General score") +
  theme(
    text = element_text(size = 18),
    axis.text = element_text(size = 18),
    legend.position = "none"
  )

# Same fitted curve over the full filtered sample (not just the lower bound).
ggplot() +
  geom_point(
    aes(x = p_current_age, y = raw_score),
    color = "#619CFF", alpha = 0.3, data = features_age_poly
  ) +
  geom_line(
    aes(x = xx, y = predict(mdl_age, newdata = data.frame(p_current_age = xx))),
    color = "#F8766D"
  ) +
  theme_bw() +
  xlim(xmin, xmax) +
  xlab("Age at COMPAS screening date") +
  ylab("General score") +
  theme(
    text = element_text(size = 18),
    axis.text = element_text(size = 18),
    legend.position = "none"
  )

# Keep only points at/above the age polynomial (filt7) for the spline fit.
features_age_spline = features %>% 
    filter(filt1, filt5, filt6, filt7) 
  

# Lower envelope for the spline: smallest score at each age.
lb_filt = features_age_spline %>%
  group_by(p_current_age) %>%
    arrange(raw_score)%>%
    top_n(n=-1, wt = raw_score)

# Filtered-out points, labeled with the reason for exclusion.
lb_outliers = rbind(
                # Reason not included in lb_filt was:
                # age not above age poly
                setdiff(lb_age, lb_filt) %>%
                  filter(!filt7) %>% # below age poly
                  mutate(reason = "Below age polynomial")
                
                ) 

# NOTE(review): lb_outliers carries every lb_age column plus `reason`, while
# the piped input below has only 3 columns plus `reason` — confirm this rbind
# succeeds / keeps the intended columns.
lb = lb_filt %>% 
     select(p_current_age, p_age_first_offense, raw_score) %>%
     mutate(reason = "Used to construct linear splines") %>%
     rbind(lb_outliers)

Generating linear splines to fit the lower bound. Plotting individuals on the new bottom edge produced by fitting to lb_filt individuals.

# Piecewise-linear fit to the lower-bound points; `segmented` estimates the
# breakpoint locations starting from initial guesses at ages 39 and 58.
mdl_age_spline <- segmented(lm(raw_score ~ p_current_age, data = lb_filt), 
                            seg.Z = ~p_current_age, psi = list(p_current_age = c(39,58)),
  control = seg.control(display = FALSE)
)
#Add Filter 8
# filt8: at/above the age spline minus a 0.05 tolerance; the remainder
# raw_score__age_spline is the target for the later models.
features = features %>%
  mutate(
    age_spline = predict(mdl_age_spline, newdata=data.frame(p_current_age=p_current_age)),
    raw_score__age_spline = raw_score - age_spline,
    filt8 = raw_score >= age_spline - 0.05
  )
intercept(mdl_age_spline)
## $p_current_age
##                Est.
## intercept1 -0.28215
## intercept2 -0.98923
## intercept3 -1.45920
slope(mdl_age_spline)
## $p_current_age
##             Est.    St.Err. t value CI(95%).l CI(95%).u
## slope1 -0.052747 0.00083055 -63.509 -0.054394 -0.051100
## slope2 -0.032251 0.00081987 -39.337 -0.033876 -0.030625
## slope3 -0.022852 0.00090704 -25.194 -0.024650 -0.021054
summary.segmented(mdl_age_spline)$psi
##                    Initial     Est.    St.Err
## psi1.p_current_age      39 34.49774 0.4479371
## psi2.p_current_age      58 49.99995 1.1230493

Note: the reason why the number of points below the age polynomial appears different between the two plots is that the first plot (displaying the points used to construct the linear spline) shows only data points where criminal involvement = 0, whereas the second plot does not have this constraint. In other words, the first plot has filt6 applied whereas the second plot does not.

# Estimated breakpoint locations from the segmented fit.
break1 =  summary.segmented(mdl_age_spline)$psi[1,2]
break2 =  summary.segmented(mdl_age_spline)$psi[2,2]

# Grids for drawing each linear segment separately.
xx1 = seq(xmin,break1, length.out=1000)
xx2 = seq(break1,break2, length.out=1000)
xx3 = seq(break2,xmax, length.out=1000)

# Lower-bound points colored by inclusion reason, with the three spline
# segments drawn in distinct colors.
age_spline_recid = 
  ggplot()+
  geom_point(aes(x=p_current_age, raw_score, color= factor(reason)),alpha=.5, data=lb ) +
  scale_color_manual(values=c("red", "grey25")) + 
  geom_line(aes(x=xx1, y = predict(mdl_age_spline, newdata=data.frame(p_current_age=xx1))),
                color="darkorange1", size = 1) +
  geom_line(aes(x=xx2, y = predict(mdl_age_spline, newdata=data.frame(p_current_age=xx2))),
                color="cyan3", size = 1) +
  geom_line(aes(x=xx3, y = predict(mdl_age_spline, newdata=data.frame(p_current_age=xx3))),
                color="seagreen3", size = 1) +
  theme_bw()+
  xlim(xmin,xmax)+
  xlab("\n Age at COMPAS screening date") +
  ylab("General Score \n") +
  theme(text = element_text(size=9),
        axis.text=element_text(size=7),
        legend.position = c(.95,.95),
        legend.title = element_blank(),
        legend.justification = c("right", "top"),
        legend.key = element_rect(fill = "aliceblue"),
        legend.background = element_rect(fill="aliceblue",
                                  size=0.5, linetype="solid")
        )

age_spline_recid

# Age spline over the full data set (filters 1 and 5 retained for quality).
# Red points fall below the age *polynomial* (filt7), not the age spline.
age_spline_recid_all <-
  ggplot() +
  # Age outliers (below the polynomial)
  geom_point(aes(x = p_current_age, y = raw_score), color = "#F8766D",
             data = features %>% filter(filt1, filt5, !filt7)) +
  # Remaining observations
  geom_point(aes(x = p_current_age, y = raw_score), color = "#619CFF", alpha = 0.3,
             data = features %>% filter(filt1, filt5, filt7)) +
  # Three spline segments, drawn in a single color here.
  geom_line(aes(x = xx1, y = predict(mdl_age_spline, newdata = data.frame(p_current_age = xx1))),
            color = "#F8766D", size = 1) +
  geom_line(aes(x = xx2, y = predict(mdl_age_spline, newdata = data.frame(p_current_age = xx2))),
            color = "#F8766D", size = 1) +
  geom_line(aes(x = xx3, y = predict(mdl_age_spline, newdata = data.frame(p_current_age = xx3))),
            color = "#F8766D", size = 1) +
  theme_bw() +
  xlim(xmin, xmax) +
  xlab("\n Age at COMPAS screening date") +
  ylab("General score \n") +
  theme(text = element_text(size = 9),
        axis.text = element_text(size = 7),
        legend.position = "top")

age_spline_recid_all

# Save both spline figures for the paper.
ggsave("Figures/age_LB_general.pdf", plot = age_spline_recid, width = 3.5, height = 2.5, units = "in")
ggsave("Figures/age_LB_alldata_general.pdf", plot = age_spline_recid_all, width = 3.5, height = 2.5, units = "in")

Examine Criminal Involvement lower bound

Look at COMPAS age remainder vs number of prior arrests (important item in the crim inv subscale)

# Age-spline remainder vs number of prior charges, colored by filt8
# (above/below the age spline).
prior_charges_LB_general = features %>%
          filter(filt1,filt3) %>% 
          select(p_charge, raw_score__age_spline, filt8) %>%
          ggplot() +
          geom_point(aes(x=p_charge,y=raw_score__age_spline,color=filt8),alpha=.5) +
          xlim(c(0,45))+
          theme_bw()+
          xlab("Number of prior charges") +
          ylab(expression(General~score~-~f[age]))  +
          theme(text = element_text(size=12),
                axis.text=element_text(size=12),
                legend.position="top") +
          scale_color_manual(name=element_blank(),
                               breaks=c("TRUE", "FALSE"),
                               labels=c(expression(Above~f[age]), 
                                        expression(Below~f[age])),
                               values=c("TRUE"="#619CFF","FALSE"="#00BA38"))

prior_charges_LB_general
## Warning: Removed 69 rows containing missing values (geom_point).

# Same plot against number of prior arrests (a key crim-inv subscale item).
prior_arrests_LB_general = features %>%
          filter(filt1,filt3) %>% 
          select(p_arrest, raw_score__age_spline, filt8) %>%
          ggplot() +
          geom_point(aes(x=p_arrest,y=raw_score__age_spline,color=filt8),alpha=.5) +
          xlim(c(0,45))+
          theme_bw()+
          xlab("Number of prior arrests") +
          ylab(expression(General~score~-~f[age]))  +
          theme(text = element_text(size=12),
                axis.text=element_text(size=12),
                legend.position="top") +
          scale_color_manual(name=element_blank(),
                               breaks=c("TRUE", "FALSE"),
                               labels=c(expression(Above~f[age]), 
                                        expression(Below~f[age])),
                               values=c("TRUE"="#619CFF","FALSE"="#00BA38"))

prior_arrests_LB_general
## Warning: Removed 2 rows containing missing values (geom_point).

ggsave("Figures/prior_arrests_LB_general.pdf", plot = prior_arrests_LB_general, width = 3.5, height = 2.5, units = "in")
## Warning: Removed 2 rows containing missing values (geom_point).
ggsave("Figures/prior_charges_LB_general.pdf", plot=prior_charges_LB_general, width = 3.5, height = 2.5, units = "in")
## Warning: Removed 69 rows containing missing values (geom_point).

Look at COMPAS age remainder vs sum of items in Criminal Involvement subscale

# Age-spline remainder vs the evenly-weighted crim-inv subscale sum.
features %>%
  filter(filt1,filt3, filt5) %>% 
  select(crim_inv, raw_score__age_spline, filt8) %>%
  ggplot() +
    geom_point(aes(x=crim_inv,y=raw_score__age_spline,color=filt8),alpha=.5) +
    xlim(c(0,45))+
    theme_bw()+
    xlab("Sum of Criminal Involvement Components") +
    ylab(expression(General~score~-~f[age]))  +
    theme(text = element_text(size=12),
          axis.text=element_text(size=12),
          legend.position="top") +
    scale_color_manual(name=element_blank(),
                         breaks=c("TRUE", "FALSE"),
                         labels=c(expression(Above~f[age]), expression(Below~f[age])),
                         values=c("TRUE"="#619CFF","FALSE"="#00BA38"))
## Warning: Removed 9 rows containing missing values (geom_point).

# ggsave with no `plot` argument saves the last plot displayed.
ggsave("Figures/crim_inv_LB_general_arrests.pdf",width = 3.5, height = 2.5, units = "in")
## Warning: Removed 9 rows containing missing values (geom_point).

Try various weightings of inputs criminal involvement subscale (i.e., instead of evenly weighted) (optional)

# Sensitivity check: redo the crim-inv lower-bound plot under every weighting
# of the four subscale items, with weights drawn from {1,2,3}^4 (81 combos).
# showWarnings = FALSE so re-running the chunk does not warn when the
# directory already exists.
dir.create("./Figures/crim_inv_weightings_general/", showWarnings = FALSE)
crim_inv_weights_perms = expand.grid(arrest=seq(1,3,1), 
                                     jail30=seq(1,3,1), 
                                     prison=seq(1,3,1), 
                                     probation=seq(1,3,1))
# seq_len() instead of 1:nrow() — safe even if the grid were empty.
for (i in seq_len(nrow(crim_inv_weights_perms))){

  weights = unlist(crim_inv_weights_perms[i,])
  # Title doubles as the output file name, recording the weight combination.
  title = paste0('arrest=',weights['arrest'],
                  '_jail30=',weights['jail30'],
                  '_prison=',weights['prison'],
                  '_probation=',weights['probation'])

  # Recompute crim_inv with this weighting and redraw the plot.
  p <- features %>%
    mutate(crim_inv = 
        weights['arrest']*p_arrest+
        weights['jail30']*p_jail30+
        weights['prison']*p_prison+
        weights['probation']*p_probation) %>%
    filter(filt1,filt3, filt5) %>% 
    select(crim_inv, raw_score__age_spline, filt8) %>%
    ggplot() +
      geom_point(aes(x=crim_inv,y=raw_score__age_spline,color=filt8),alpha=.5) +
      xlim(c(0,45))+
      theme_bw()+
      xlab("Sum of Criminal Involvement Components") +
      ylab(expression(General~score~-~f[age]))  +
      theme(text = element_text(size=12),
            plot.title = element_text(size=8),
            axis.text=element_text(size=12),
            legend.position="top") +
      scale_color_manual(name=element_blank(),
                           breaks=c("TRUE", "FALSE"),
                           labels=c(expression(Above~f[age]), expression(Below~f[age])),
                           values=c("TRUE"="#619CFF","FALSE"="#00BA38")) +
    ggtitle(title)

  ggsave(paste0("Figures/crim_inv_weightings_general/",title,".png"),
         plot=p, width = 3.5, height = 2.5, units = "in")
}

Fit lower bound to COMPAS age remainder vs. the sum of criminal involvement subscale items

# Filter out low-quality datapoints.
# filt8 will be applied later.
features_crim_inv_poly = features %>% 
                        filter(filt1, filt3, filt5, crim_inv <= 40) # convert to actual filter later


# Lower envelope of the age-spline remainder: the 4 smallest values at each
# crim_inv value (ties included).
lb_crim_inv = features_crim_inv_poly %>%
  filter(filt8) %>%  
  select(-filt8) %>%
  # select(crim_inv, raw_score__age_spline) %>%
  group_by(crim_inv) %>%
  arrange(raw_score__age_spline) %>%
  top_n(n=-4, wt=raw_score__age_spline) # Fit lower bound on smallest value for each crim inv component

# write.csv(lb_crim_inv, "../lb_crim_inv.csv")

# Cubic in crim_inv predicting the COMPAS age remainder's lower bound.
mdl_crim_inv = lm(raw_score__age_spline ~ 
                 I(crim_inv^3) + 
                 I(crim_inv^2) + 
                  crim_inv, 
              data=lb_crim_inv)
 
# More precision for paper
summary(mdl_crim_inv)
## 
## Call:
## lm(formula = raw_score__age_spline ~ I(crim_inv^3) + I(crim_inv^2) + 
##     crim_inv, data = lb_crim_inv)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.06727 -0.10168  0.03038  0.15386  0.57592 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   -1.816e-01  7.165e-02  -2.534   0.0122 *  
## I(crim_inv^3)  2.570e-05  1.532e-05   1.678   0.0954 .  
## I(crim_inv^2) -2.222e-03  9.280e-04  -2.394   0.0178 *  
## crim_inv       1.193e-01  1.578e-02   7.559 3.03e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2502 on 159 degrees of freedom
## Multiple R-squared:  0.9136, Adjusted R-squared:  0.9119 
## F-statistic: 560.1 on 3 and 159 DF,  p-value: < 2.2e-16
print("Coefficients:")
## [1] "Coefficients:"
sprintf("%.20e",mdl_crim_inv$coefficients) # More precision for paper
## [1] "-1.81561193977485685336e-01" "2.57044321819749947103e-05" 
## [3] "-2.22161522877928341302e-03" "1.19300684990506777883e-01"
 ## Add g(crim_inv) to features
# note that g(crim_inv) fits the lower bound of the COMPAS GENERAL AGE REMAINDER 
# NOTE(review): the column name `raw_score__age_spline__g_vio_hist` says
# "vio_hist" but holds the general-score crim-inv remainder — likely carried
# over from the violence-score script; confirm downstream usage before renaming.
features = features %>%
  mutate(
    g_crim_inv = predict(mdl_crim_inv, newdata=data.frame(crim_inv=crim_inv)),
    raw_score__age_spline__g_vio_hist = raw_score__age_spline - g_crim_inv ,
    filt9 = raw_score__age_spline >= g_crim_inv - 0.05
  )

## Add same filters to lb_crim_inv
lb_crim_inv = lb_crim_inv %>%
  mutate(
    g_crim_inv = predict(mdl_crim_inv, newdata=data.frame(crim_inv=crim_inv)),
    raw_score__age_spline__g_vio_hist = raw_score__age_spline - g_crim_inv ,
    filt9 = raw_score__age_spline >= g_crim_inv - 0.05
  )

Plotting settings

# Plot range for the crim-inv axis, and a dense grid for fitted curves.
xmin_crim_inv <- 0
xmax_crim_inv <- 40
xx_crim_inv <- seq(from = xmin_crim_inv, to = xmax_crim_inv, length.out = 1000)

Criminal involvement polynomial plot

Generate a preliminary plot of crim inv vs COMPAS general score remainder

# Lower-bound points colored by filt9, with the fitted cubic overlaid.
ggplot()+
  geom_point(aes(x=crim_inv, raw_score__age_spline, color = factor(filt9)),alpha=.3, data=lb_crim_inv) +
  geom_line(aes(x=xx_crim_inv, predict(mdl_crim_inv, newdata=data.frame(crim_inv=xx_crim_inv))),color="#F8766D") +
  theme_bw()+
  xlim(xmin_crim_inv,xmax_crim_inv)+
  xlab("Criminal Involvement Subscale Sum") +
  ylab("General score - f_age") +
  theme(text = element_text(size=18),
        axis.text=element_text(size=18),
        legend.position="none")

# Fitted cubic over the full filtered sample.
ggplot()+
  geom_point(aes(x=crim_inv, raw_score__age_spline), color="#619CFF",alpha=.3, data=features_crim_inv_poly) +
  geom_line(aes(x=xx_crim_inv, predict(mdl_crim_inv, newdata=data.frame(crim_inv=xx_crim_inv))),color="#F8766D") +
  theme_bw()+
  xlim(xmin_crim_inv,xmax_crim_inv)+
  xlab("Criminal Involvement Subscale Sum") +
  ylab("General score - f_age") +
  theme(text = element_text(size=18),
        axis.text=element_text(size=18),
        legend.position="none")

# unfiltered 
ggplot()+
  geom_point(aes(x=crim_inv, raw_score__age_spline), color="#619CFF",alpha=.3, data=features) +
  theme_bw()+
  xlim(xmin_crim_inv,xmax_crim_inv)+
  ylim(-0.5, 4) + 
  xlab("Criminal Involvement Subscale Sum -- Filtered ") +
  ylab("General score - f_age") +
  theme(text = element_text(size=18),
        axis.text=element_text(size=18),
        legend.position="none")
## Warning: Removed 38 rows containing missing values (geom_point).

# filtered (colored by filt9: above/below the crim-inv polynomial)
ggplot()+
  geom_point(aes(x=crim_inv, raw_score__age_spline, color=factor(filt9)),alpha=.3, data=features) +
  theme_bw()+
  xlim(xmin_crim_inv,xmax_crim_inv) +
  ylim(-0.5, 4) + 
  xlab("Criminal Involvement Subscale Sum -- Filtered ") +
  ylab("General score - f_age") +
  theme(text = element_text(size=18),
        axis.text=element_text(size=18),
        legend.position="none")
## Warning: Removed 38 rows containing missing values (geom_point).

Construct crim_inv spline

Want to keep track of points that are below the age splines and points that are below the crim inv polynomial

# Keep points above both the age spline (filt8) and the crim-inv
# polynomial (filt9), capped at crim_inv <= 40.
features_crim_inv_spline = features %>% 
    filter(filt1, filt3, filt5, filt8, filt9, crim_inv <=40) 
  

# Points we will use to construct the linear crim-inv spline:
# the smallest remainder at each crim_inv value (ties included).
lb_filt_crim_inv = features_crim_inv_spline %>%
  group_by(crim_inv) %>%
    arrange(raw_score__age_spline)%>%
    top_n(n=-1, wt = raw_score__age_spline)

# Points filtered out because they are below the age spline.
lb_age_spline_outliers = features %>% 
                         filter(filt1, filt3, filt5, !filt8, crim_inv <= 40) %>% 
                         mutate(reason = "Below age spline")

# Points filtered out because they are below the crim. inv. polynomial.
lb_crim_inv_poly_outliers = features %>% 
                            filter(filt1, filt3, filt5, filt8, !filt9, crim_inv <= 40) %>%
                            mutate(reason = "Below crim. inv. polynomial")

# Stack the spline-construction points with both outlier groups.
# FIX: was `rbind(list(a, b))` — base rbind does not accept a list of data
# frames as one argument; bind_rows() stacks them (and tolerates any column
# differences) before the final select.
lb_crim_inv_all = lb_filt_crim_inv %>% 
     mutate(reason = "Used to construct linear splines") %>%
     bind_rows(lb_age_spline_outliers, lb_crim_inv_poly_outliers) %>%
     select(person_id, crim_inv, raw_score__age_spline, reason) 

rm(lb_age_spline_outliers, lb_crim_inv_poly_outliers)

Generating linear splines to fit the lower bound of the COMPAS age remainder. Plotting individuals on the new bottom edge produced by fitting to lb_filt_crim_inv individuals.

# Piecewise-linear fit to the crim-inv lower bound; one breakpoint with an
# initial guess at crim_inv = 9 (estimated location printed below).
mdl_crim_inv_spline <- segmented(lm(raw_score__age_spline ~ crim_inv, data = lb_filt_crim_inv), 
                            seg.Z = ~crim_inv, psi = list(crim_inv = c(9)),
  control = seg.control(display = FALSE)
)
#Add Filter 10
# filt10: at/above the crim-inv spline minus a 0.05 tolerance.
features = features %>%
  mutate(
    crim_inv_spline = predict(mdl_crim_inv_spline, newdata=data.frame(crim_inv=crim_inv)),
    raw_score__age_spline__crim_inv_spline = raw_score__age_spline - crim_inv_spline,
    filt10 = raw_score__age_spline >= crim_inv_spline - 0.05
  )
intercept(mdl_crim_inv_spline)
## $crim_inv
##                Est.
## intercept1 -0.12241
## intercept2  0.50939
slope(mdl_crim_inv_spline)
## $crim_inv
##            Est.   St.Err. t value CI(95%).l CI(95%).u
## slope1 0.097854 0.0051412  19.033  0.087437  0.108270
## slope2 0.055734 0.0022495  24.776  0.051176  0.060292
summary.segmented(mdl_crim_inv_spline)$psi
##               Initial     Est.   St.Err
## psi1.crim_inv       9 14.99998 1.355633

Plot crim inv spline

# Estimated breakpoint; note this reuses/overwrites `break1` from the age plot.
break1 =  summary.segmented(mdl_crim_inv_spline)$psi[1,2]

# Grids for drawing each linear segment separately (reuses xx1/xx2).
xx1 = seq(xmin_crim_inv,break1, length.out=1000)
xx2 = seq(break1,xmax_crim_inv, length.out=1000)

# Lower-bound points colored by inclusion reason, with the two spline
# segments drawn in distinct colors.
crim_inv_spline_recid = 
  ggplot()+
  geom_point(aes(x=crim_inv, raw_score__age_spline, color= factor(reason)),alpha=.5, data=lb_crim_inv_all ) +
  scale_color_manual(values=c("green", "red", "grey25")) + 
  geom_line(aes(x=xx1, y = predict(mdl_crim_inv_spline, newdata=data.frame(crim_inv=xx1))),
                color="darkorange1", size = 1) +
  geom_line(aes(x=xx2, y = predict(mdl_crim_inv_spline, newdata=data.frame(crim_inv=xx2))),
                color="cyan3", size = 1) +
  theme_bw()+
  xlim(xmin_crim_inv,xmax_crim_inv)+
  xlab("Sum of Criminal Involvement \n Subscale Components") +
  ylab("General Score - f_age \n") +
  theme(text = element_text(size=9),
        axis.text=element_text(size=7),
        legend.position = c(0,1),
        legend.title = element_blank(),
        legend.justification = c("left", "top"),
        legend.key = element_rect(fill = NA),
        legend.background = element_rect(fill=NA,
        size=0.2,
        linetype="solid")
        )

crim_inv_spline_recid

# Crim inv spline with all data (filters 1,3,5 still applied for data quality)
crim_inv_spline_recid_all = 
  ggplot()+
  
  geom_point(aes(x=crim_inv, raw_score__age_spline), # age spline outliers
             color="green", 
             data=features %>% filter(filt1, filt3, filt5,!filt8)) + 
  geom_point(aes(x=crim_inv, raw_score__age_spline), # crim inv poly outliers
             color="red", alpha=.3, 
             data=features %>% filter(filt1, filt3, filt5,filt8, !filt9)) + 
  geom_point(aes(x=crim_inv, raw_score__age_spline), # not outliers
             color="#619CFF", alpha=.3, 
             data=features %>% filter(filt1, filt3, filt5,filt8, filt9)) + 

  geom_line(aes(x=xx_crim_inv, y = predict(mdl_crim_inv_spline, newdata=data.frame(crim_inv=xx_crim_inv))),
                color="#F8766D", size = 1) +
  theme_bw()+
  xlim(xmin_crim_inv,xmax_crim_inv)+
  xlab("Sum of Criminal Involvement\n Subscale Components") +
  ylab("General score - f_age \n") +
  theme(text = element_text(size=12),
        axis.text=element_text(size=12),
        legend.position="top") 

crim_inv_spline_recid_all
## Warning: Removed 18 rows containing missing values (geom_point).
## Warning: Removed 9 rows containing missing values (geom_point).

# Save both crim-inv spline figures for the paper.
ggsave("Figures/crim_inv_spline_general.pdf", plot = crim_inv_spline_recid, width = 3.5, height = 2.5, units = "in")
ggsave("Figures/crim_inv_spline_alldata_general.pdf", plot = crim_inv_spline_recid_all, width = 3.5, height = 2.5, units = "in")
## Warning: Removed 18 rows containing missing values (geom_point).
## Warning: Removed 9 rows containing missing values (geom_point).

Predictions of (raw - age polynomial) using several ML methods

There are a few groups of predictors we will use: * Group 0: without using age variables or race * Group 1: without using current age or race but with age at first arrest * Group 2: without using current age but with race and age at first arrest * Group 3: without using race but with current age and age at first arrest * Group 4: using age variables and race * Group 5: test

#### Generic stuff (applies to all models)

## Filter rows
features_filt = features %>%
  filter(filt1, filt3) 

## Set parameters (each combination will be run)
# xgboost
# NOTE(review): "reg:linear" is deprecated in newer xgboost releases in favor
# of "reg:squarederror"; kept as-is for the version used here.
param <- list(objective = "reg:linear",
              eval_metric = "rmse",
              eta = c(.05,.1),
              gamma = c(.5, 1), 
              max_depth = c(2,5),
              min_child_weight = c(5,10),
              subsample = c(1),
              colsample_bytree = c(1)
)

# svm
# gamma_scale is presumably a custom knob consumed by fit_svm() in
# functions.r — confirm its meaning there.
param_svm = list(
  type = 'eps-regression',
  cost = c(0.5,1,2),
  epsilon = c(0.5,1,1.5),
  gamma_scale = c(0.5,1,2)
)

# One row per predictor group; RMSE per model family filled in below.
res_rmse = data.frame(Group = 0:8, lm = NA, xgb = NA, rf = NA, svm = NA)

Group 0 models: predicting (raw score - age spline) without using current age, age at first offense or race

### Create group 0 training data

## Select features and round count features
# Group 0: no age variables and no race — counts only.
train = features_filt %>%
  select(
    #p_current_age,
    # p_age_first_offense,
    p_arrest,
    p_jail30,
    p_prison,
    p_probation,
    raw_score__age_spline)

## Format for xgboost
# Target is the age-spline remainder; predictors are everything else.
train_xgb = xgb.DMatrix(
  "data" = train %>% select(-raw_score__age_spline) %>% as.matrix(),
  "label" = train %>% select(raw_score__age_spline) %>% as.matrix()
)

Model 1: Linear model

# Baseline linear model on the group-0 predictors.
mdl_lm = lm(raw_score__age_spline ~ ., data=train)
summary(mdl_lm)
## 
## Call:
## lm(formula = raw_score__age_spline ~ ., data = train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.42430 -0.45339 -0.05984  0.38049  2.37584 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.875979   0.006968 125.717   <2e-16 ***
## p_arrest     0.059410   0.001993  29.809   <2e-16 ***
## p_jail30    -0.084580   0.040735  -2.076   0.0379 *  
## p_prison     0.171827   0.008814  19.494   <2e-16 ***
## p_probation  0.072067   0.008348   8.633   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5765 on 9037 degrees of freedom
## Multiple R-squared:  0.4038, Adjusted R-squared:  0.4036 
## F-statistic:  1530 on 4 and 9037 DF,  p-value: < 2.2e-16
# Training-set RMSE (rmse() comes from functions.r).
res_rmse[res_rmse$Group==0,]$lm = rmse(predict(mdl_lm, newdata=train), train$raw_score__age_spline) # ADJUST GROUP

Model 2: xgboost

set.seed(923)
# Grid-search over `param` combinations (fit_xgboost from functions.r).
mdl_xgb = fit_xgboost(train_xgb, param)
## Training on 16 sets of parameters.
##                  14          
## objective        "reg:linear"
## eval_metric      "rmse"      
## eta              "0.1"       
## gamma            "0.5"       
## max_depth        "5"         
## min_child_weight "10"        
## subsample        "1"         
## colsample_bytree "1"
### xgboost plot
pred = predict(mdl_xgb, newdata=train_xgb)
actual = train$raw_score__age_spline

res_rmse[res_rmse$Group==0,]$xgb = rmse(pred, actual) # ADJUST GROUP

# Shared axis limits so the prediction-vs-actual plot is square.
axis_min = min(min(pred),min(actual))
axis_max = max(max(pred),max(actual))

data.frame(xgboost = pred, compas=actual) %>%
  ggplot() +
  geom_point(aes(x=compas,y=xgboost), alpha=.3) +
  geom_abline(slope=1, color="red")+
  xlim(c(axis_min,axis_max)) +
  ylim(c(axis_min,axis_max)) +
  coord_fixed() +
  theme_bw()+
  xlab(expression(General~score~-~f[age])) +
  ylab("XGBoost prediction")+
  theme(
        text = element_text(size=14),
        axis.text=element_text(size=14))

### Variable importance
xgb.plot.importance(importance_matrix = xgb.importance(model = mdl_xgb))

Model 3: random forest

set.seed(784)

# Random forest with default settings on the group-0 predictors.
mdl_rf = randomForest(
  formula = raw_score__age_spline ~ .,
  data = train
)

# $predicted holds the out-of-bag predictions.
res_rmse[res_rmse$Group==0,]$rf = rmse(mdl_rf$predicted, train$raw_score__age_spline) # ADJUST GROUP

Model 4: SVM

# Grid-search SVM regression (fit_svm from functions.r).
mdl_svm = fit_svm(raw_score__age_spline ~ ., train, param_svm)
## Training on 27 sets of parameters.
## [1] "Best parameters:"
##             20              
## type        "eps-regression"
## cost        "1"             
## epsilon     "0.5"           
## gamma_scale "2"             
## gamma       "0.4"
res_rmse[res_rmse$Group==0,]$svm = rmse(mdl_svm$fitted, train$raw_score__age_spline) # ADJUST GROUP

Cleanup

rm(train, train_xgb, mdl_lm, mdl_xgb, mdl_rf, mdl_svm)

Group 1 models: predicting (raw score - age spline) without using current age or race

### Create group 1 training data

## Select features and round count features
# Group 1: adds age at first offense; still no current age or race.
train = features_filt %>%
  select(
    #p_current_age,
    p_age_first_offense,
    p_arrest,
    p_jail30,
    p_prison,
    p_probation,
    raw_score__age_spline)

## Format for xgboost
train_xgb = xgb.DMatrix(
  "data" = train %>% select(-raw_score__age_spline) %>% as.matrix(),
  "label" = train %>% select(raw_score__age_spline) %>% as.matrix()
)

Model 1: Linear model

# Baseline linear model on the group-1 predictors.
mdl_lm = lm(raw_score__age_spline ~ ., data=train)
summary(mdl_lm)
## 
## Call:
## lm(formula = raw_score__age_spline ~ ., data = train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.09844 -0.44649 -0.06349  0.36243  2.54534 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          1.0933977  0.0173633  62.972  < 2e-16 ***
## p_age_first_offense -0.0075173  0.0005509 -13.645  < 2e-16 ***
## p_arrest             0.0556322  0.0019923  27.924  < 2e-16 ***
## p_jail30            -0.0663080  0.0403463  -1.643      0.1    
## p_prison             0.1767267  0.0087328  20.237  < 2e-16 ***
## p_probation          0.0661198  0.0082754   7.990 1.52e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5707 on 9036 degrees of freedom
## Multiple R-squared:  0.4159, Adjusted R-squared:  0.4156 
## F-statistic:  1287 on 5 and 9036 DF,  p-value: < 2.2e-16
res_rmse[res_rmse$Group==1,]$lm = rmse(predict(mdl_lm, newdata=train), train$raw_score__age_spline) # ADJUST GROUP

Model 2: xgboost

## xgboost for Group 1; fit_xgboost grid-searches param (best shown below).
set.seed(923)
mdl_xgb = fit_xgboost(train_xgb, param)
## Training on 16 sets of parameters.
##                  16          
## objective        "reg:linear"
## eval_metric      "rmse"      
## eta              "0.1"       
## gamma            "1"         
## max_depth        "5"         
## min_child_weight "10"        
## subsample        "1"         
## colsample_bytree "1"
### xgboost plot
pred = predict(mdl_xgb, newdata=train_xgb)
actual = train$raw_score__age_spline

res_rmse[res_rmse$Group==1,]$xgb = rmse(pred, actual) # ADJUST GROUP

## Shared axis limits so the identity (y = x) reference line is meaningful.
axis_min = min(min(pred),min(actual))
axis_max = max(max(pred),max(actual))

data.frame(xgboost = pred, compas=actual) %>%
  ggplot() +
  geom_point(aes(x=compas,y=xgboost), alpha=.3) +
  geom_abline(slope=1, color="red")+
  xlim(c(axis_min,axis_max)) +
  ylim(c(axis_min,axis_max)) +
  coord_fixed() +
  theme_bw()+
  xlab(expression(General~score~-~f[age])) +
  ylab("XGBoost prediction")+
  theme(
        text = element_text(size=14),
        axis.text=element_text(size=14))

### Variable importance
xgb.plot.importance(importance_matrix = xgb.importance(model = mdl_xgb))

Model 3: random forest

set.seed(784)

## Random forest for Group 1 (default randomForest hyperparameters).
mdl_rf = randomForest(
  formula = raw_score__age_spline ~ .,
  data = train
)

## NOTE: mdl_rf$predicted are out-of-bag predictions, so this RMSE is not
## directly comparable to the in-sample RMSEs of the other models.
res_rmse[res_rmse$Group==1,]$rf = rmse(mdl_rf$predicted, train$raw_score__age_spline) # ADJUST GROUP

Model 4: SVM

## SVM for Group 1 (same grid search as Group 0).
mdl_svm = fit_svm(raw_score__age_spline ~ ., train, param_svm)
## Training on 27 sets of parameters.
## [1] "Best parameters:"
##             19              
## type        "eps-regression"
## cost        "0.5"           
## epsilon     "0.5"           
## gamma_scale "2"             
## gamma       "0.3333333"
res_rmse[res_rmse$Group==1,]$svm = rmse(mdl_svm$fitted, train$raw_score__age_spline) # ADJUST GROUP

Cleanup

rm(train, train_xgb, mdl_lm, mdl_xgb, mdl_rf, mdl_svm)

Group 2 models: predicting (raw score - age spline) without using age variables but with race

### Create group 2 training data

## Select features and round count features
## Group 2: Group 1 criminal-history features plus race indicator columns
## (still no age variables).
train = features_filt %>%
  select(
    #p_current_age,
    p_age_first_offense,
    p_arrest,
    p_jail30,
    p_prison,
    p_probation,
    race_black,
    race_white,
    race_hispanic,
    race_asian,
    race_native, # race == "Other" is the baseline
    raw_score__age_spline)

## Format for xgboost
train_xgb = xgb.DMatrix(
  "data" = train %>% select(-raw_score__age_spline) %>% as.matrix(),
  "label" = train %>% select(raw_score__age_spline) %>% as.matrix()
)

Model 1: Linear model

## Linear model for Group 2; summary output recorded from original run.
mdl_lm = lm(raw_score__age_spline ~ ., data=train)
summary(mdl_lm)
## 
## Call:
## lm(formula = raw_score__age_spline ~ ., data = train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.93704 -0.43505 -0.06018  0.35537  2.40025 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          0.7715142  0.0294567  26.192  < 2e-16 ***
## p_age_first_offense -0.0056361  0.0005609 -10.048  < 2e-16 ***
## p_arrest             0.0535447  0.0019642  27.260  < 2e-16 ***
## p_jail30            -0.0502479  0.0396846  -1.266  0.20548    
## p_prison             0.1687020  0.0086064  19.602  < 2e-16 ***
## p_probation          0.0653331  0.0081413   8.025 1.14e-15 ***
## race_black           0.3672723  0.0255076  14.399  < 2e-16 ***
## race_white           0.2504831  0.0257782   9.717  < 2e-16 ***
## race_hispanic        0.1000628  0.0309000   3.238  0.00121 ** 
## race_asian           0.0958429  0.0852294   1.125  0.26082    
## race_native          0.2440703  0.1087937   2.243  0.02489 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.561 on 9031 degrees of freedom
## Multiple R-squared:  0.4358, Adjusted R-squared:  0.4352 
## F-statistic: 697.7 on 10 and 9031 DF,  p-value: < 2.2e-16
## In-sample RMSE for the Group 2 linear model.
res_rmse[res_rmse$Group==2,]$lm = rmse(predict(mdl_lm, newdata=train), train$raw_score__age_spline) # ADJUST GROUP

Model 2: xgboost

## xgboost for Group 2 (same pipeline as Group 1).
set.seed(480)
mdl_xgb = fit_xgboost(train_xgb, param)
## Training on 16 sets of parameters.
##                  5           
## objective        "reg:linear"
## eval_metric      "rmse"      
## eta              "0.05"      
## gamma            "0.5"       
## max_depth        "5"         
## min_child_weight "5"         
## subsample        "1"         
## colsample_bytree "1"
### xgboost plot
pred = predict(mdl_xgb, newdata=train_xgb)
actual = train$raw_score__age_spline

res_rmse[res_rmse$Group==2,]$xgb = rmse(pred, actual) # ADJUST GROUP

axis_min = min(min(pred),min(actual))
axis_max = max(max(pred),max(actual))

## Prediction vs. target (raw score minus age spline), with identity line.
data.frame(xgboost = pred, compas=actual) %>%
  ggplot() +
  geom_point(aes(x=compas,y=xgboost), alpha=.3) +
  geom_abline(slope=1, color="red")+
  xlim(c(axis_min,axis_max)) +
  ylim(c(axis_min,axis_max)) +
  coord_fixed() +
  theme_bw()+
  xlab(expression(General~score~-~f[age])) +
  ylab("XGBoost prediction")+
  theme(
        text = element_text(size=14),
        axis.text=element_text(size=14))

## Second plot: prediction against the ORIGINAL raw score (not the residual
## target), so axes are intentionally on different scales.
data.frame(xgboost = pred, compas=features_filt$raw_score) %>%
  ggplot() +
  geom_point(aes(x=xgboost,y=compas), alpha=.3) +
  theme_bw()+
  xlab("XGBoost prediction") +
  ylab("COMPAS raw score")+
  theme(
        text = element_text(size=14),
        axis.text=element_text(size=14))

### Variable importance
xgb.plot.importance(importance_matrix = xgb.importance(model = mdl_xgb))

Model 3: random forest

set.seed(6778)

## Random forest for Group 2.
mdl_rf = randomForest(
  formula = raw_score__age_spline ~ .,
  data = train
)

## mdl_rf$predicted are out-of-bag predictions (see Group 1 note).
res_rmse[res_rmse$Group==2,]$rf = rmse(mdl_rf$predicted, train$raw_score__age_spline) # ADJUST GROUP

Model 4: SVM

## SVM for Group 2.
mdl_svm = fit_svm(raw_score__age_spline ~ ., train, param_svm)
## Training on 27 sets of parameters.
## [1] "Best parameters:"
##             21              
## type        "eps-regression"
## cost        "2"             
## epsilon     "0.5"           
## gamma_scale "2"             
## gamma       "0.1818182"
res_rmse[res_rmse$Group==2,]$svm = rmse(mdl_svm$fitted, train$raw_score__age_spline) # ADJUST GROUP

Cleanup

rm(train, train_xgb, mdl_lm, mdl_xgb, mdl_rf)

Group 3 models: predicting (raw score - age spline) without using race but with age variables

### Create group 3 training data

## Select features and round count features
## Group 3: criminal-history features plus age variables, but no race.
train = features_filt %>%
  select(
    p_current_age,
    p_age_first_offense,
    p_arrest,
    p_jail30,
    p_prison,
    p_probation,
    raw_score__age_spline)

## Format for xgboost
train_xgb = xgb.DMatrix(
  "data" = train %>% select(-raw_score__age_spline) %>% as.matrix(),
  "label" = train %>% select(raw_score__age_spline) %>% as.matrix()
)

Model 1: Linear model

## Linear model for Group 3; summary output recorded from original run.
mdl_lm = lm(raw_score__age_spline ~ ., data=train)
summary(mdl_lm)
## 
## Call:
## lm(formula = raw_score__age_spline ~ ., data = train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.83634 -0.44132 -0.06739  0.35870  2.55283 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          1.059171   0.017881  59.235  < 2e-16 ***
## p_current_age        0.009175   0.001203   7.627 2.64e-14 ***
## p_age_first_offense -0.016188   0.001263 -12.822  < 2e-16 ***
## p_arrest             0.051133   0.002072  24.681  < 2e-16 ***
## p_jail30            -0.039529   0.040372  -0.979    0.328    
## p_prison             0.159596   0.008990  17.752  < 2e-16 ***
## p_probation          0.052787   0.008432   6.260 4.03e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5689 on 9035 degrees of freedom
## Multiple R-squared:  0.4196, Adjusted R-squared:  0.4192 
## F-statistic:  1089 on 6 and 9035 DF,  p-value: < 2.2e-16
## In-sample RMSE for the Group 3 linear model.
res_rmse[res_rmse$Group==3,]$lm = rmse(predict(mdl_lm, newdata=train), train$raw_score__age_spline) # ADJUST GROUP

Model 2: xgboost

## xgboost for Group 3; this figure is saved to disk (used in the paper).
set.seed(999)
mdl_xgb = fit_xgboost(train_xgb, param)
## Training on 16 sets of parameters.
##                  14          
## objective        "reg:linear"
## eval_metric      "rmse"      
## eta              "0.1"       
## gamma            "0.5"       
## max_depth        "5"         
## min_child_weight "10"        
## subsample        "1"         
## colsample_bytree "1"
### xgboost plot
pred = predict(mdl_xgb, newdata=train_xgb)
actual = train$raw_score__age_spline

res_rmse[res_rmse$Group==3,]$xgb = rmse(pred, actual) # ADJUST GROUP

axis_min = min(min(pred),min(actual))
axis_max = max(max(pred),max(actual))

data.frame(xgboost = pred, compas=actual) %>%
  ggplot() +
  geom_point(aes(x=compas,y=xgboost), alpha=.3) +
  geom_abline(slope=1, color="red")+
  xlim(c(axis_min,axis_max)) +
  ylim(c(axis_min,axis_max)) +
  coord_fixed() +
  theme_bw()+
  xlab(expression(General~score~-~f[age])) +
  ylab("XGBoost prediction")+
  theme(
        text = element_text(size=14),
        axis.text=element_text(size=14))

## ggsave writes the most recently displayed ggplot.
ggsave("Figures/raw-fage_xgboost_gp3_general.pdf",width = 3, height = 3, units = "in")
### Variable importance
xgb.plot.importance(importance_matrix = xgb.importance(model = mdl_xgb))

Model 3: random forest

set.seed(5)

## Random forest for Group 3.
mdl_rf = randomForest(
  formula = raw_score__age_spline ~ .,
  data = train
)

## mdl_rf$predicted are out-of-bag predictions (see Group 1 note).
res_rmse[res_rmse$Group==3,]$rf = rmse(mdl_rf$predicted, train$raw_score__age_spline) # ADJUST GROUP

Model 4: SVM

## SVM for Group 3.
mdl_svm = fit_svm(raw_score__age_spline ~ ., train, param_svm)
## Training on 27 sets of parameters.
## [1] "Best parameters:"
##             20              
## type        "eps-regression"
## cost        "1"             
## epsilon     "0.5"           
## gamma_scale "2"             
## gamma       "0.2857143"
res_rmse[res_rmse$Group==3,]$svm = rmse(mdl_svm$fitted, train$raw_score__age_spline) # ADJUST GROUP

Cleanup

rm(train, train_xgb, mdl_lm, mdl_xgb, mdl_rf)

Group 4 models: predicting (raw score - age spline) using age variables and race

### Create group 4 training data

## Select features and round count features
## Group 4: everything — criminal history, age variables, and race.
train = features_filt %>%
  select(
    p_current_age,
    p_age_first_offense,
    p_arrest,
    p_jail30,
    p_prison,
    p_probation,
    race_black,
    race_white,
    race_hispanic,
    race_asian,
    race_native, # race == "Other" is the baseline
    raw_score__age_spline)

## Format for xgboost
train_xgb = xgb.DMatrix(
  "data" = train %>% select(-raw_score__age_spline) %>% as.matrix(),
  "label" = train %>% select(raw_score__age_spline) %>% as.matrix()
)

Model 1: Linear model

## Linear model for Group 4; summary output recorded from original run.
mdl_lm = lm(raw_score__age_spline ~ ., data=train)
summary(mdl_lm)
## 
## Call:
## lm(formula = raw_score__age_spline ~ ., data = train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.76345 -0.43064 -0.06504  0.35500  2.40477 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          0.737217   0.029694  24.827  < 2e-16 ***
## p_current_age        0.009171   0.001185   7.736 1.13e-14 ***
## p_age_first_offense -0.014257   0.001247 -11.435  < 2e-16 ***
## p_arrest             0.049026   0.002043  23.995  < 2e-16 ***
## p_jail30            -0.023093   0.039711  -0.582 0.560901    
## p_prison             0.151392   0.008865  17.077  < 2e-16 ***
## p_probation          0.052113   0.008293   6.284 3.45e-10 ***
## race_black           0.367906   0.025425  14.470  < 2e-16 ***
## race_white           0.245145   0.025704   9.537  < 2e-16 ***
## race_hispanic        0.104469   0.030805   3.391 0.000699 ***
## race_asian           0.103146   0.084958   1.214 0.224749    
## race_native          0.235458   0.108447   2.171 0.029943 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5592 on 9030 degrees of freedom
## Multiple R-squared:  0.4396, Adjusted R-squared:  0.4389 
## F-statistic: 643.8 on 11 and 9030 DF,  p-value: < 2.2e-16
## In-sample RMSE for the Group 4 linear model.
res_rmse[res_rmse$Group==4,]$lm = rmse(predict(mdl_lm, newdata=train), train$raw_score__age_spline) # ADJUST GROUP

Model 2: xgboost

## xgboost for Group 4; figure saved to disk.
set.seed(23)
mdl_xgb = fit_xgboost(train_xgb, param)
## Training on 16 sets of parameters.
##                  16          
## objective        "reg:linear"
## eval_metric      "rmse"      
## eta              "0.1"       
## gamma            "1"         
## max_depth        "5"         
## min_child_weight "10"        
## subsample        "1"         
## colsample_bytree "1"
### xgboost plot
pred = predict(mdl_xgb, newdata=train_xgb)
actual = train$raw_score__age_spline

res_rmse[res_rmse$Group==4,]$xgb = rmse(pred, actual) # ADJUST GROUP

axis_min = min(min(pred),min(actual))
axis_max = max(max(pred),max(actual))

data.frame(xgboost = pred, compas=actual) %>%
  ggplot() +
  geom_point(aes(x=compas,y=xgboost), alpha=.3) +
  geom_abline(slope=1, color="red")+
  xlim(c(axis_min,axis_max)) +
  ylim(c(axis_min,axis_max)) +
  coord_fixed() +
  theme_bw()+
  xlab(expression(General~score~-~f[age])) +
  ylab("XGBoost prediction")+
  theme(
        text = element_text(size=12),
        axis.text=element_text(size=12))

ggsave("Figures/raw-fage_xgboost_gp4_general.pdf",width = 3, height = 3, units = "in")
### Variable importance
xgb.plot.importance(importance_matrix = xgb.importance(model = mdl_xgb))

## Individuals to highlight in the annotated scatter plot, keyed by
## (person_id, screening_date). These correspond to "Table 5" in the write-up.
highlight = data.frame(
  person_id= c(799, 1284, 1394, 1497, 1515, 1638, 3145, 3291, 5722, 6337, 6886, 7997, 8200, 8375, 8491, 10553, 10774, 11231, 11312, 11414),
  screening_date = ymd(c("2014-06-15","2014-05-14","2014-11-28","2013-07-29","2013-10-23","2013-10-04","2014-12-14","2013-01-17","2013-10-24","2014-02-04","2013-07-12","2014-04-26","2014-05-05","2013-03-19","2014-01-18","2014-09-20","2013-04-09","2014-02-23","2014-05-02","2014-11-26")),
  highlight = TRUE
)

## Attach the Group 4 xgboost predictions to the filtered feature table and
## flag the highlighted rows; unmatched rows get highlight = FALSE.
## NOTE(review): bind_cols assumes predict() returns rows in the same order
## as features_filt — confirm train_xgb was built from features_filt unshuffled.
df_plot = features_filt %>%
  bind_cols(xgboost = predict(mdl_xgb, newdata=train_xgb)) %>%
  left_join(highlight, by = c("person_id","screening_date")) %>%
  mutate(highlight = if_else(is.na(highlight), FALSE, TRUE)) %>%
  mutate(highlight = factor(if_else(highlight==TRUE,"In Table 5", "Not in Table 5"), levels=c("In Table 5", "Not in Table 5")))

## Which highlighted points get their name label placed in which quadrant
## (to avoid label overlap).
person_id_text_topright = c(8375, 11231, 1515)
#person_id_text_topright = highlight$person_id
person_id_text_topleft = c(1394, 1497)
person_id_text_botright = c(11312, 6886, 8491, 10774)
person_id_text_botleft = c(799)

## Scatter of prediction vs. raw score; highlighted rows drawn opaque on top,
## with one geom_text layer per label-placement quadrant.
ggplot() +
  geom_point(aes(x=xgboost,y=raw_score, color=highlight),  alpha = .3, data = filter(df_plot, highlight=="Not in Table 5")) +
  geom_point(aes(x=xgboost,y=raw_score, color=highlight),  data = filter(df_plot, highlight=="In Table 5")) +
  theme_bw()+
  geom_text(aes(x=xgboost,y=raw_score,label=name),size=3,nudge_x=0, nudge_y=0, hjust="left",vjust="bottom", data=filter(df_plot, person_id %in% person_id_text_topright & highlight=="In Table 5")) + 
  geom_text(aes(x=xgboost,y=raw_score,label=name),size=3,nudge_x=0, nudge_y=0, hjust="right",vjust="bottom", data=filter(df_plot, person_id %in% person_id_text_topleft & highlight=="In Table 5")) + 
  geom_text(aes(x=xgboost,y=raw_score,label=name),size=3,nudge_x=0, nudge_y=0, hjust="left",vjust="top", data=filter(df_plot, person_id %in% person_id_text_botright & highlight=="In Table 5")) + 
  geom_text(aes(x=xgboost,y=raw_score,label=name),size=3,nudge_x=0, nudge_y=0, hjust="right",vjust="top", data=filter(df_plot, person_id %in% person_id_text_botleft & highlight=="In Table 5")) + 
  xlab(expression(XGBoost~pred.~of~general~score~-~f[age])) +
  ylab("General score")+
  theme(
    text = element_text(size=12),
    axis.text=element_text(size=12),
    #legend.position = "top",
    legend.position="none") +
  scale_color_discrete(name = element_blank()) +
  xlim(0.2,3.5)

ggsave("Figures/xgboost_rawScore_annotate_general.pdf",width = 4, height = 4, units = "in")

Model 3: random forest

set.seed(3720)

## Random forest for Group 4.
mdl_rf = randomForest(
  formula = raw_score__age_spline ~ .,
  data = train
)

## mdl_rf$predicted are out-of-bag predictions (see Group 1 note).
res_rmse[res_rmse$Group==4,]$rf = rmse(mdl_rf$predicted, train$raw_score__age_spline) # ADJUST GROUP

Model 4: SVM

## SVM for Group 4.
mdl_svm = fit_svm(raw_score__age_spline ~ ., train, param_svm)
## Training on 27 sets of parameters.
## [1] "Best parameters:"
##             21              
## type        "eps-regression"
## cost        "2"             
## epsilon     "0.5"           
## gamma_scale "2"             
## gamma       "0.1666667"
res_rmse[res_rmse$Group==4,]$svm = rmse(mdl_svm$fitted, train$raw_score__age_spline) # ADJUST GROUP

Cleanup

rm(train, train_xgb, mdl_lm, mdl_xgb, mdl_rf)

Group 5 models: Group 4 features plus p_prison30 (a sanity check; p_prison30 turns out to be linearly dependent on the other predictors — see the NA coefficient in the linear model below)

### Create group 5 training data

## Select features and round count features
## Group 5: Group 4 features plus p_prison30.
## NOTE(review): p_prison30 is linearly dependent on the other predictors in
## this sample — the lm below reports its coefficient as NA ("1 not defined
## because of singularities").
train = features_filt %>%
  select(
    p_current_age,
    p_age_first_offense,
    p_arrest,
    p_jail30,
    p_prison30,
    p_prison,
    p_probation,
    race_black,
    race_white,
    race_hispanic,
    race_asian,
    race_native, # race == "Other" is the baseline
    raw_score__age_spline)

## Format for xgboost
train_xgb = xgb.DMatrix(
  "data" = train %>% select(-raw_score__age_spline) %>% as.matrix(),
  "label" = train %>% select(raw_score__age_spline) %>% as.matrix()
)

Model 1: Linear model

## Linear model for Group 5. The fit is rank-deficient because p_prison30 is
## collinear with the other predictors (hence the predict() warning below).
mdl_lm = lm(raw_score__age_spline ~ ., data=train)
summary(mdl_lm)
## 
## Call:
## lm(formula = raw_score__age_spline ~ ., data = train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.76345 -0.43064 -0.06504  0.35500  2.40477 
## 
## Coefficients: (1 not defined because of singularities)
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          0.737217   0.029694  24.827  < 2e-16 ***
## p_current_age        0.009171   0.001185   7.736 1.13e-14 ***
## p_age_first_offense -0.014257   0.001247 -11.435  < 2e-16 ***
## p_arrest             0.049026   0.002043  23.995  < 2e-16 ***
## p_jail30            -0.023093   0.039711  -0.582 0.560901    
## p_prison30                 NA         NA      NA       NA    
## p_prison             0.151392   0.008865  17.077  < 2e-16 ***
## p_probation          0.052113   0.008293   6.284 3.45e-10 ***
## race_black           0.367906   0.025425  14.470  < 2e-16 ***
## race_white           0.245145   0.025704   9.537  < 2e-16 ***
## race_hispanic        0.104469   0.030805   3.391 0.000699 ***
## race_asian           0.103146   0.084958   1.214 0.224749    
## race_native          0.235458   0.108447   2.171 0.029943 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5592 on 9030 degrees of freedom
## Multiple R-squared:  0.4396, Adjusted R-squared:  0.4389 
## F-statistic: 643.8 on 11 and 9030 DF,  p-value: < 2.2e-16
res_rmse[res_rmse$Group==5,]$lm = rmse(predict(mdl_lm, newdata=train), train$raw_score__age_spline) # ADJUST GROUP
## Warning in predict.lm(mdl_lm, newdata = train): prediction from a rank-
## deficient fit may be misleading

Model 2: xgboost

## xgboost for Group 5 (tree models are unaffected by the collinearity above).
set.seed(480)
mdl_xgb = fit_xgboost(train_xgb, param)
## Training on 16 sets of parameters.
##                  5           
## objective        "reg:linear"
## eval_metric      "rmse"      
## eta              "0.05"      
## gamma            "0.5"       
## max_depth        "5"         
## min_child_weight "5"         
## subsample        "1"         
## colsample_bytree "1"
### xgboost plot
pred = predict(mdl_xgb, newdata=train_xgb)
actual = train$raw_score__age_spline

res_rmse[res_rmse$Group==5,]$xgb = rmse(pred, actual) # ADJUST GROUP

axis_min = min(min(pred),min(actual))
axis_max = max(max(pred),max(actual))

data.frame(xgboost = pred, compas=actual) %>%
  ggplot() +
  geom_point(aes(x=compas,y=xgboost), alpha=.3) +
  geom_abline(slope=1, color="red")+
  xlim(c(axis_min,axis_max)) +
  ylim(c(axis_min,axis_max)) +
  coord_fixed() +
  theme_bw()+
  xlab(expression(General~score~-~f[age])) +
  ylab("XGBoost prediction")+
  theme(
        text = element_text(size=14),
        axis.text=element_text(size=14))

### Variable importance
xgb.plot.importance(importance_matrix = xgb.importance(model = mdl_xgb))

Model 3: random forest

set.seed(1123)

## Random forest for Group 5.
mdl_rf = randomForest(
  formula = raw_score__age_spline ~ .,
  data = train
)

## mdl_rf$predicted are out-of-bag predictions (see Group 1 note).
res_rmse[res_rmse$Group==5,]$rf = rmse(mdl_rf$predicted, train$raw_score__age_spline) # ADJUST GROUP

Model 4: SVM

## SVM for Group 5.
mdl_svm = fit_svm(raw_score__age_spline ~ ., train, param_svm)
## Training on 27 sets of parameters.
## [1] "Best parameters:"
##             21              
## type        "eps-regression"
## cost        "2"             
## epsilon     "0.5"           
## gamma_scale "2"             
## gamma       "0.1538462"
res_rmse[res_rmse$Group==5,]$svm = rmse(mdl_svm$fitted, train$raw_score__age_spline) # ADJUST GROUP

Cleanup

rm(train, train_xgb, mdl_lm, mdl_xgb, mdl_rf)

Comparison

knitr::kable(res_rmse)
Group lm xgb rf svm
0 0.5763262 0.5197590 0.5520113 0.5204621
1 0.5704792 0.5089603 0.5430797 0.5125682
2 0.5606448 0.4970774 0.5118637 0.5024905
3 0.5686515 0.4987465 0.5141315 0.5042145
4 0.5587960 0.4940864 0.5071319 0.4961323
5 0.5587960 0.4895826 0.5061418 0.4967642
6 NA NA NA NA
7 NA NA NA NA
8 NA NA NA NA

Predictions of (raw - age spline - g_crim_inv) using several ML methods

There are a few groups of predictors we will use: * Group 6: using criminal history (only age variables) * Group 7: without age variables (only criminal history) * Group 8: with everything (age and criminal history)

Race not included in any of groups 6-8.

Group 6 models: predicting (raw score - age spline - g_crim_inv) using only age and age at first arrest

### Create group 6 training data
## Select features and round count features
## Group 6: age variables only. Target now also subtracts the fitted
## criminal-involvement spline (raw_score__age_spline__crim_inv_spline).
train = features_filt %>%
  select(
    p_current_age,
     p_age_first_offense,
    #p_arrest,
    #p_jail30,
    #p_prison,
    #p_probation,
    raw_score__age_spline__crim_inv_spline)
## Format for xgboost
train_xgb = xgb.DMatrix(
  "data" = train %>% select(-raw_score__age_spline__crim_inv_spline) %>% as.matrix(),
  "label" = train %>% select(raw_score__age_spline__crim_inv_spline) %>% as.matrix()
)

Model 1: Linear model

## Linear model for Group 6; note the very low R-squared — age explains little
## of the residual once both splines are removed.
mdl_lm = lm(raw_score__age_spline__crim_inv_spline ~ ., data=train)
summary(mdl_lm)
## 
## Call:
## lm(formula = raw_score__age_spline__crim_inv_spline ~ ., data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.5944 -0.4298 -0.0861  0.3402  2.5345 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          1.0696061  0.0175379  60.988  < 2e-16 ***
## p_current_age       -0.0064197  0.0008706  -7.374 1.81e-13 ***
## p_age_first_offense  0.0018659  0.0009013   2.070   0.0385 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.568 on 9039 degrees of freedom
## Multiple R-squared:  0.01081,    Adjusted R-squared:  0.01059 
## F-statistic:  49.4 on 2 and 9039 DF,  p-value: < 2.2e-16
res_rmse[res_rmse$Group==6,]$lm = rmse(predict(mdl_lm, newdata=train), train$raw_score__age_spline__crim_inv_spline) # ADJUST GROUP

Model 2: xgboost

## xgboost for Group 6.
set.seed(923)
mdl_xgb = fit_xgboost(train_xgb, param)
## Training on 16 sets of parameters.
##                  13          
## objective        "reg:linear"
## eval_metric      "rmse"      
## eta              "0.05"      
## gamma            "0.5"       
## max_depth        "5"         
## min_child_weight "10"        
## subsample        "1"         
## colsample_bytree "1"
### xgboost plot
pred = predict(mdl_xgb, newdata=train_xgb)
actual = train$raw_score__age_spline__crim_inv_spline
res_rmse[res_rmse$Group==6,]$xgb = rmse(pred, actual) # ADJUST GROUP
axis_min = min(min(pred),min(actual))
axis_max = max(max(pred),max(actual))
data.frame(xgboost = pred, compas=actual) %>%
  ggplot() +
  geom_point(aes(x=compas,y=xgboost), alpha=.3) +
  geom_abline(slope=1, color="red")+
  xlim(c(axis_min,axis_max)) +
  ylim(c(axis_min,axis_max)) +
  coord_fixed() +
  theme_bw()+
  ## NOTE(review): x-axis label carried over from groups 0-5; the target here
  ## also subtracts the crim-involvement spline — confirm the label is intended.
  xlab(expression(General~score~-~f[age])) +
  ylab("XGBoost prediction")+
  theme(
        text = element_text(size=14),
        axis.text=element_text(size=14))

### Variable importance
xgb.plot.importance(importance_matrix = xgb.importance(model = mdl_xgb))

Model 3: random forest

set.seed(784)
## Random forest for Group 6; mdl_rf$predicted are out-of-bag predictions.
mdl_rf = randomForest(
  formula = raw_score__age_spline__crim_inv_spline ~ .,
  data = train
)
res_rmse[res_rmse$Group==6,]$rf = rmse(mdl_rf$predicted, train$raw_score__age_spline__crim_inv_spline) # ADJUST GROUP

Model 4: SVM

## SVM for Group 6.
mdl_svm = fit_svm(raw_score__age_spline__crim_inv_spline ~ ., train, param_svm)
## Training on 27 sets of parameters.
## [1] "Best parameters:"
##             24              
## type        "eps-regression"
## cost        "2"             
## epsilon     "1"             
## gamma_scale "2"             
## gamma       "0.6666667"
res_rmse[res_rmse$Group==6,]$svm = rmse(mdl_svm$fitted, train$raw_score__age_spline__crim_inv_spline) # ADJUST GROUP

Cleanup

rm(train, train_xgb, mdl_lm, mdl_xgb, mdl_rf, mdl_svm)

Group 7 models: predicting (raw score - age spline - g_crim_inv) without using current age, age at first offense or race

### Create group 7 training data
## Select features and round count features
## Group 7: criminal-history counts only — no age variables, no race.
train = features_filt %>%
  select(
    #p_current_age,
    # p_age_first_offense,
    p_arrest,
    p_jail30,
    p_prison,
    p_probation,
    raw_score__age_spline__crim_inv_spline)
## Format for xgboost
train_xgb = xgb.DMatrix(
  "data" = train %>% select(-raw_score__age_spline__crim_inv_spline) %>% as.matrix(),
  "label" = train %>% select(raw_score__age_spline__crim_inv_spline) %>% as.matrix()
)

Model 1: Linear model

## Linear model for Group 7; summary output recorded from original run.
mdl_lm = lm(raw_score__age_spline__crim_inv_spline ~ ., data=train)
summary(mdl_lm)
## 
## Call:
## lm(formula = raw_score__age_spline__crim_inv_spline ~ ., data = train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.80776 -0.43212 -0.07162  0.35501  2.40310 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.971128   0.006747 143.926  < 2e-16 ***
## p_arrest    -0.018578   0.001930  -9.626  < 2e-16 ***
## p_jail30    -0.097527   0.039446  -2.472   0.0134 *  
## p_prison     0.092803   0.008535  10.873  < 2e-16 ***
## p_probation -0.043673   0.008084  -5.402 6.74e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5582 on 9037 degrees of freedom
## Multiple R-squared:  0.04486,    Adjusted R-squared:  0.04443 
## F-statistic: 106.1 on 4 and 9037 DF,  p-value: < 2.2e-16
res_rmse[res_rmse$Group==7,]$lm = rmse(predict(mdl_lm, newdata=train), train$raw_score__age_spline__crim_inv_spline) # ADJUST GROUP

Model 2: xgboost

## xgboost for Group 7.
set.seed(923)
mdl_xgb = fit_xgboost(train_xgb, param)
## Training on 16 sets of parameters.
##                  6           
## objective        "reg:linear"
## eval_metric      "rmse"      
## eta              "0.1"       
## gamma            "0.5"       
## max_depth        "5"         
## min_child_weight "5"         
## subsample        "1"         
## colsample_bytree "1"
### xgboost plot
pred = predict(mdl_xgb, newdata=train_xgb)
actual = train$raw_score__age_spline__crim_inv_spline
res_rmse[res_rmse$Group==7,]$xgb = rmse(pred, actual) # ADJUST GROUP
axis_min = min(min(pred),min(actual))
axis_max = max(max(pred),max(actual))
data.frame(xgboost = pred, compas=actual) %>%
  ggplot() +
  geom_point(aes(x=compas,y=xgboost), alpha=.3) +
  geom_abline(slope=1, color="red")+
  xlim(c(axis_min,axis_max)) +
  ylim(c(axis_min,axis_max)) +
  coord_fixed() +
  theme_bw()+
  ## NOTE(review): label carried over from groups 0-5; target here also
  ## subtracts the crim-involvement spline — confirm intended.
  xlab(expression(General~score~-~f[age])) +
  ylab("XGBoost prediction")+
  theme(
        text = element_text(size=14),
        axis.text=element_text(size=14))

### Variable importance
xgb.plot.importance(importance_matrix = xgb.importance(model = mdl_xgb))

Model 3: random forest

set.seed(784)
## Random forest for Group 7; mdl_rf$predicted are out-of-bag predictions.
mdl_rf = randomForest(
  formula = raw_score__age_spline__crim_inv_spline ~ .,
  data = train
)
res_rmse[res_rmse$Group==7,]$rf = rmse(mdl_rf$predicted, train$raw_score__age_spline__crim_inv_spline) # ADJUST GROUP

Model 4: SVM

## SVM for Group 7.
mdl_svm = fit_svm(raw_score__age_spline__crim_inv_spline ~ ., train, param_svm)
## Training on 27 sets of parameters.
## [1] "Best parameters:"
##             23              
## type        "eps-regression"
## cost        "1"             
## epsilon     "1"             
## gamma_scale "2"             
## gamma       "0.4"
res_rmse[res_rmse$Group==7,]$svm = rmse(mdl_svm$fitted, train$raw_score__age_spline__crim_inv_spline) # ADJUST GROUP

Cleanup

rm(train, train_xgb, mdl_lm, mdl_xgb, mdl_rf, mdl_svm)

Group 8 models: predicting (raw score - age spline - g_crim_inv) using everything

### Create group 8 training data
## Select features and round count features
## Group 8: age variables plus criminal-history counts (no race).
train = features_filt %>%
  select(
    p_current_age,
    p_age_first_offense,
    p_arrest,
    p_jail30,
    p_prison,
    p_probation,
    raw_score__age_spline__crim_inv_spline)
## Format for xgboost
train_xgb = xgb.DMatrix(
  "data" = train %>% select(-raw_score__age_spline__crim_inv_spline) %>% as.matrix(),
  "label" = train %>% select(raw_score__age_spline__crim_inv_spline) %>% as.matrix()
)

Model 1: Linear model

## Linear model for Group 8; summary output recorded from original run.
mdl_lm = lm(raw_score__age_spline__crim_inv_spline ~ ., data=train)
summary(mdl_lm)
## 
## Call:
## lm(formula = raw_score__age_spline__crim_inv_spline ~ ., data = train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.86075 -0.42502 -0.07299  0.34222  2.55373 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          1.140689   0.017385  65.615  < 2e-16 ***
## p_current_age        0.005347   0.001170   4.572 4.90e-06 ***
## p_age_first_offense -0.011605   0.001227  -9.454  < 2e-16 ***
## p_arrest            -0.024493   0.002014 -12.160  < 2e-16 ***
## p_jail30            -0.065995   0.039252  -1.681   0.0927 .  
## p_prison             0.087091   0.008741   9.964  < 2e-16 ***
## p_probation         -0.056627   0.008198  -6.907 5.28e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5531 on 9035 degrees of freedom
## Multiple R-squared:  0.06265,    Adjusted R-squared:  0.06202 
## F-statistic: 100.6 on 6 and 9035 DF,  p-value: < 2.2e-16
res_rmse[res_rmse$Group==8,]$lm = rmse(predict(mdl_lm, newdata=train), train$raw_score__age_spline__crim_inv_spline) # ADJUST GROUP

Model 2: xgboost

## xgboost for Group 8.
set.seed(923)
mdl_xgb = fit_xgboost(train_xgb, param)
## Training on 16 sets of parameters.
##                  5           
## objective        "reg:linear"
## eval_metric      "rmse"      
## eta              "0.05"      
## gamma            "0.5"       
## max_depth        "5"         
## min_child_weight "5"         
## subsample        "1"         
## colsample_bytree "1"
### xgboost plot
pred = predict(mdl_xgb, newdata=train_xgb)
actual = train$raw_score__age_spline__crim_inv_spline
res_rmse[res_rmse$Group==8,]$xgb = rmse(pred, actual) # ADJUST GROUP
axis_min = min(min(pred),min(actual))
axis_max = max(max(pred),max(actual))
data.frame(xgboost = pred, compas=actual) %>%
  ggplot() +
  geom_point(aes(x=compas,y=xgboost), alpha=.3) +
  geom_abline(slope=1, color="red")+
  xlim(c(axis_min,axis_max)) +
  ylim(c(axis_min,axis_max)) +
  coord_fixed() +
  theme_bw()+
  ## NOTE(review): label carried over from groups 0-5; target here also
  ## subtracts the crim-involvement spline — confirm intended.
  xlab(expression(General~score~-~f[age])) +
  ylab("XGBoost prediction")+
  theme(
        text = element_text(size=14),
        axis.text=element_text(size=14))

### Variable importance
xgb.plot.importance(importance_matrix = xgb.importance(model = mdl_xgb))

Model 3: random forest

set.seed(784)
## Random forest for Group 8; mdl_rf$predicted are out-of-bag predictions.
mdl_rf = randomForest(
  formula = raw_score__age_spline__crim_inv_spline ~ .,
  data = train
)
res_rmse[res_rmse$Group==8,]$rf = rmse(mdl_rf$predicted, train$raw_score__age_spline__crim_inv_spline) # ADJUST GROUP

Model 4: SVM

## SVM for Group 8.
mdl_svm = fit_svm(raw_score__age_spline__crim_inv_spline ~ ., train, param_svm)
## Training on 27 sets of parameters.
## [1] "Best parameters:"
##             24              
## type        "eps-regression"
## cost        "2"             
## epsilon     "1"             
## gamma_scale "2"             
## gamma       "0.2857143"
res_rmse[res_rmse$Group==8,]$svm = rmse(mdl_svm$fitted, train$raw_score__age_spline__crim_inv_spline) # ADJUST GROUP

Cleanup

rm(train, train_xgb, mdl_lm, mdl_xgb, mdl_rf, mdl_svm)

Comparison

knitr::kable(res_rmse)
Group lm xgb rf svm
0 0.5763262 0.5197590 0.5520113 0.5204621
1 0.5704792 0.5089603 0.5430797 0.5125682
2 0.5606448 0.4970774 0.5118637 0.5024905
3 0.5686515 0.4987465 0.5141315 0.5042145
4 0.5587960 0.4940864 0.5071319 0.4961323
5 0.5587960 0.4895826 0.5061418 0.4967642
6 0.5679499 0.5520712 0.5619801 0.5551400
7 0.5580910 0.5192577 0.5320781 0.5221024
8 0.5528691 0.4978607 0.5112501 0.5063758

Predictions using xgboost only

We use the “Group 3” models but predict the raw score and the raw score minus all lower bounds we fitted. Results from this section can be combined with Group 3 xgboost results above where we predicted the raw score minus the age lower bound.

Predicting the raw score

### Create group 3 training data

## Keep the six group-3 predictors plus the unadjusted raw score target.
train <- features_filt %>%
  select(
    p_current_age,
    p_age_first_offense,
    p_arrest,
    p_jail30,
    p_prison,
    p_probation,
    raw_score)

## Build the xgboost design matrix / label pair.
train_xgb <- xgb.DMatrix(
  data = as.matrix(select(train, -raw_score)),
  label = as.matrix(select(train, raw_score))
)
set.seed(540)
mdl_xgb <- fit_xgboost(train_xgb, param)
## Training on 16 sets of parameters.
##                  13          
## objective        "reg:linear"
## eval_metric      "rmse"      
## eta              "0.05"      
## gamma            "0.5"       
## max_depth        "5"         
## min_child_weight "10"        
## subsample        "1"         
## colsample_bytree "1"

### In-sample fit: predicted vs. actual raw score on equal axes
pred   <- predict(mdl_xgb, newdata = train_xgb)
actual <- train$raw_score

print(paste("RMSE:", round(rmse(pred, actual), 4)))
## [1] "RMSE: 0.4953"

# Shared axis limits so the 45-degree reference line is meaningful.
axis_min <- min(c(pred, actual))
axis_max <- max(c(pred, actual))

data.frame(xgboost = pred, compas = actual) %>%
  ggplot() +
  geom_point(aes(x = compas, y = xgboost), alpha = .3) +
  geom_abline(slope = 1, color = "red") +
  xlim(c(axis_min, axis_max)) +
  ylim(c(axis_min, axis_max)) +
  coord_fixed() +
  theme_bw() +
  labs(x = "General score", y = "XGBoost prediction") +
  #annotate("text", x = -3.5, y = 0.5, label = paste("RMSE:",round(rmse(pred, actual),4)))+
  theme(
    text = element_text(size = 14),
    axis.text = element_text(size = 14)
  )

ggsave("Figures/raw_xgboost_gp3_general.pdf",width = 3, height = 3, units = "in")

Predicting the raw score - f(age) - g(crim_inv)

### Create group 3 training data

## Keep the six group-3 predictors plus the age- and criminal-involvement-
## adjusted raw score target.
train <- features_filt %>%
  select(
    p_current_age,
    p_age_first_offense,
    p_arrest,
    p_jail30,
    p_prison,
    p_probation,
    raw_score__age_spline__crim_inv_spline)

## Build the xgboost design matrix / label pair.
train_xgb <- xgb.DMatrix(
  data = as.matrix(select(train, -raw_score__age_spline__crim_inv_spline)),
  label = as.matrix(select(train, raw_score__age_spline__crim_inv_spline))
)
set.seed(7301)
mdl_xgb <- fit_xgboost(train_xgb, param)
## Training on 16 sets of parameters.
##                  5           
## objective        "reg:linear"
## eval_metric      "rmse"      
## eta              "0.05"      
## gamma            "0.5"       
## max_depth        "5"         
## min_child_weight "5"         
## subsample        "1"         
## colsample_bytree "1"

### In-sample fit: predicted vs. actual adjusted score on equal axes
pred   <- predict(mdl_xgb, newdata = train_xgb)
actual <- train$raw_score__age_spline__crim_inv_spline

print(paste("RMSE:", round(rmse(pred, actual), 4)))
## [1] "RMSE: 0.4987"

# Shared axis limits so the 45-degree reference line is meaningful.
axis_min <- min(c(pred, actual))
axis_max <- max(c(pred, actual))

data.frame(xgboost = pred, compas = actual) %>%
  ggplot() +
  geom_point(aes(x = compas, y = xgboost), alpha = .3) +
  geom_abline(slope = 1, color = "red") +
  xlim(c(axis_min, axis_max)) +
  ylim(c(axis_min, axis_max)) +
  coord_fixed() +
  theme_bw() +
  xlab(expression(General~score~-~f[age]~-~g[crim~inv])) +
  ylab("XGBoost prediction") +
  #annotate("text", x = 0.5, y = 4, label = paste("RMSE:",round(rmse(pred, actual),4)))+
  theme(
    text = element_text(size = 14),
    axis.text = element_text(size = 14)
  )

ggsave("Figures/raw-fage-gcriminv_xgboost_gp3_general.pdf",width = 3, height = 3, units = "in")

Replicating ProPublica logistic regression

# Reconstruct the ProPublica-style feature set: indicator variables for the
# age buckets, sex, and a binary "high risk" label from the COMPAS decile.
propub = features_filt %>%
  filter(filt4) %>% # Only people with valid recidivism values
  mutate(age_low = if_else(p_current_age < 25,1,0), 
         age_high = if_else(p_current_age > 45,1,0),
         female = if_else(sex=="Female",1,0),
         n_priors = p_felony_count_person + p_misdem_count_person, # NOTE(review): created but not used in the model below — p_arrest is used instead; confirm intended
         compas_high = if_else(`Risk of Recidivism_decile_score` >= 5, 1, 0), # Medium and High risk scores get +1 label
         race = relevel(factor(race), ref="Caucasian")) # Base level is Caucasian, as in ProPublica analysis

print(paste("Number of observations for logistic regression:",nrow(propub)))
## [1] "Number of observations for logistic regression: 5759"
# Logistic regression of the binary high-risk label on demographics, arrest
# count, charge degree, and recidivism. race is already a factor (releveled
# above), so as.factor() here is a no-op that keeps the existing levels but
# determines the coefficient names shown in the summary.
mdl_glm = glm(compas_high ~
                female +
                age_high +
                age_low +
                as.factor(race) +
                p_arrest +
                is_misdem +
                recid,
                family=binomial(link='logit'), data=propub)

summary(mdl_glm)
## 
## Call:
## glm(formula = compas_high ~ female + age_high + age_low + as.factor(race) + 
##     p_arrest + is_misdem + recid, family = binomial(link = "logit"), 
##     data = propub)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.3690  -0.7420  -0.2762   0.8256   2.7371  
## 
## Coefficients:
##                                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                     -1.63647    0.08358 -19.579  < 2e-16 ***
## female                           0.18584    0.08647   2.149   0.0316 *  
## age_high                        -1.61708    0.13790 -11.727  < 2e-16 ***
## age_low                          1.55708    0.07244  21.495  < 2e-16 ***
## as.factor(race)African-American  0.47448    0.07431   6.385 1.71e-10 ***
## as.factor(race)Asian            -0.27638    0.51050  -0.541   0.5882    
## as.factor(race)Hispanic         -0.26119    0.13121  -1.991   0.0465 *  
## as.factor(race)Native American   0.45819    0.66743   0.687   0.4924    
## as.factor(race)Other            -0.73909    0.16196  -4.563 5.03e-06 ***
## p_arrest                         0.30132    0.01224  24.616  < 2e-16 ***
## is_misdem                       -0.46844    0.07071  -6.625 3.47e-11 ***
## recid                            0.50614    0.06974   7.257 3.95e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 7907.2  on 5758  degrees of freedom
## Residual deviance: 5492.4  on 5747  degrees of freedom
## AIC: 5516.4
## 
## Number of Fisher Scoring iterations: 5